from gensim.models import Word2Vec
import _pickle as pickle
import tweepy
def load_obj(name):
    """Unpickle and return the object stored at output/<name>.pkl."""
    path = 'output/' + name + '.pkl'
    with open(path, 'rb') as handle:
        return pickle.load(handle)
# Load the pre-trained artifacts produced by the training pipeline
# (word2vec model, word->cluster map, and the fitted random forest).
print("Loading Word Vector Model...")
model = Word2Vec.load("output/400features_30minwords_10context_twitter")
print("Loading Second Word Vector Model...")
# NOTE(review): model2 loads the exact same file as model and is never
# used below — looks like a copy-paste leftover; confirm and remove.
model2 = Word2Vec.load("output/400features_30minwords_10context_twitter")
print("Loading Word Centroid Map...")
# Maps each vocabulary word to its k-means cluster index (see load_obj).
word_centroid_map = load_obj("twitter_word_centroid_map")
print("Loading Trained Random Forest Classifier...")
# Classifier trained on bag-of-centroids features; used by predict() below.
load_forest = load_obj("twitter_forest")
print("Setting up Twitter Authentication...")
# SECURITY(review): live API credentials are hardcoded in source. Anyone
# with read access to this file (or the VCS history) can act as this
# Twitter account. Revoke/rotate these keys and load them from the
# environment or a config file excluded from version control.
consumer_key = "GlYCSvDgUet79gori1M5rxmMW"
consumer_secret = "JRNb6FIjsSMOu6CU4QRMdJ1kMsVd7IF6g9PnKgD2qrdeva2iFY"
access_token = "259205396-ZWx5lQCRzy5GMnzmNTIQzMckDqRnzjfVnoFu0VgG"
access_token_secret = "eBx4oQQYHhXRgQE6cOioMSUhzpLWNLc8c2hgL4GGmG2Kd"
# OAuth 1.0a user-context authentication for the REST API.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
import re
def process_tweet(tweet, punctuation=False):
    """Normalize a raw tweet into a list of lowercase word tokens.

    Strips @mentions, URLs, the standalone retweet marker "RT", and
    '#' characters; optionally strips basic punctuation as well.

    Args:
        tweet: raw tweet text.
        punctuation: when True, also remove '.', ',', '?' and '!'.

    Returns:
        List of lowercase tokens (whitespace-split).
    """
    # Drop @mentions.
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Drop URLs. Bug fix: the original pattern used 'www\.[\s]+', which
    # matched whitespace after "www." — '[^\s]+' removes the actual link.
    tweet = re.sub(r'((www\.[^\s]+)|(https?:/?/?[^\s]+))', '', tweet)
    # Drop the retweet marker. Bug fix: plain str.replace('RT', '')
    # mangled any word containing "RT" (e.g. "START" -> "STA "); word
    # boundaries restrict removal to the standalone token.
    tweet = re.sub(r'\bRT\b', '', tweet)
    tweet = tweet.replace('#', '')
    if punctuation:
        for ch in '.,?!':
            tweet = tweet.replace(ch, '')
    return tweet.lower().split()
# All word vectors as one (vocab_size, 400) matrix.
# NOTE(review): wv.syn0 is the legacy gensim (<4.0) attribute; newer
# releases expose this as wv.vectors — confirm the pinned gensim version.
word_vectors = model.wv.syn0
# One cluster per ~10 vocabulary words — presumably mirrors the k used
# by the training-time k-means that built word_centroid_map; verify.
num_clusters = int(word_vectors.shape[0] / 10)
def create_bag_of_centroids(wordlist, word_centroid_map):
    """Build a cluster-occurrence histogram for a tokenized tweet.

    Args:
        wordlist: iterable of tokens.
        word_centroid_map: dict mapping each known word to its cluster index.

    Returns:
        float32 numpy vector of length max(cluster index) + 1, where
        entry i counts how many tokens fall in cluster i; unknown
        tokens are ignored.
    """
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    known = (word_centroid_map[w] for w in wordlist if w in word_centroid_map)
    for cluster_index in known:
        bag_of_centroids[cluster_index] += 1
    return bag_of_centroids
# Sanity check: print the model's nearest neighbours of "awful" on one
# line (each word followed by two spaces, matching print's default sep).
neighbours = model.most_similar("awful")
print("".join(word + "  " for word, _score in neighbours))
# Show the member words of a few sample clusters.
# Perf fix: the original re-materialised list(word_centroid_map.values())
# and list(...keys()) on every index inside the loop — O(n^2) over the
# vocabulary. A single pass over .items() gives the same words in the
# same order.
for cluster in range(77, 80):
    print("\nCluster %d" % cluster)
    words = [word for word, assigned in word_centroid_map.items()
             if assigned == cluster]
    print(words)
# Search term and sample size for the live-tweet sentiment run.
query = "Star Wars"
max_tweets = 500
print ("Loading Tweets...")
# Page through the search API until max_tweets statuses are collected.
# NOTE(review): api.search is the tweepy v3 name; tweepy v4 renamed it
# to api.search_tweets — confirm the installed tweepy version.
searched_tweets = [status.text for status in tweepy.Cursor(api.search, q=query, lang="en").items(max_tweets)]
import numpy as np
print ("Pre-allocating an Array...")
# One row of cluster counts per fetched tweet.
user_centroids = np.zeros( (max_tweets, num_clusters), \
dtype="float32" )
print ("Producing Test Centroids...")
# Idiom fix: enumerate replaces the original manual counter variable.
# NOTE(review): create_bag_of_centroids sizes its vector from
# word_centroid_map (max index + 1), which is assumed to equal
# num_clusters; a mismatch would raise on row assignment — confirm.
for row, tweet in enumerate(searched_tweets):
    user_centroids[row] = create_bag_of_centroids(process_tweet(tweet, True), word_centroid_map)
print ("Predicting Test Sets...")
# Classify every tweet's bag-of-centroids vector in one batch call.
result = load_forest.predict(user_centroids)
# Tally predictions per class label.
unique, counts = np.unique(result, return_counts=True)
result_dict = dict(zip(unique, counts))
print ("\nPrediction :")
# Labels 4 = positive, 0 = negative — presumably the Sentiment140
# convention used at training time; confirm against the training script.
# .get(label, 0) guards against a class being absent from the results.
print (" Positive - %.2f%%,\n Negative - %.2f%%" %
(result_dict.get(4, 0)*100/len(result),\
result_dict.get(0, 0)*100/len(result)))
def switch(x):
    """Map a numeric sentiment label to its display name.

    0 maps to 'Negative'; every other value maps to 'Positive'.
    """
    return 'Negative' if x == 0 else 'Positive'
# Pair each tweet with its human-readable sentiment label and show a
# sample of the first ten.
formatted_result = [switch(label) for label in result]
output = list(zip(formatted_result, searched_tweets))
for sentiment, text in output[:10]:
    print(sentiment + "\n" + text + "\n")